In [1]:
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
In [2]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Load Data and check for missing values¶

In [4]:
# load the data into a pandas dataframe

df = pd.read_csv('/content/airbnb_listings_austin.csv')
In [5]:
# preview data
df.info()
df.head
print(df)

# total count of NaN values
print((df.isnull().sum()/df.shape[0])*100)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5835 entries, 0 to 5834
Data columns (total 54 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5835 non-null   int64  
 1   listing_url                  5835 non-null   object 
 2   name                         5835 non-null   object 
 3   summary                      5373 non-null   object 
 4   space                        4475 non-null   object 
 5   description                  5832 non-null   object 
 6   experiences_offered          5835 non-null   object 
 7   neighborhood_overview        3572 non-null   object 
 8   notes                        2412 non-null   object 
 9   transit                      3492 non-null   object 
 10  host_id                      5835 non-null   int64  
 11  host_name                    5820 non-null   object 
 12  host_since                   5820 non-null   object 
 13  host_location                5810 non-null   object 
 14  host_about                   3974 non-null   object 
 15  host_response_time           4177 non-null   object 
 16  host_response_rate           4177 non-null   object 
 17  host_is_superhost            5820 non-null   object 
 18  host_listings_count          5820 non-null   float64
 19  host_has_profile_pic         5820 non-null   object 
 20  host_identity_verified       5820 non-null   object 
 21  neighbourhood                4800 non-null   object 
 22  city                         5835 non-null   object 
 23  property_type                5835 non-null   object 
 24  room_type                    5835 non-null   object 
 25  accommodates                 5835 non-null   int64  
 26  bathrooms                    5789 non-null   float64
 27  bedrooms                     5829 non-null   float64
 28  beds                         5812 non-null   float64
 29  bed_type                     5835 non-null   object 
 30  amenities                    5835 non-null   object 
 31  square_feet                  302 non-null    float64
 32  price                        5835 non-null   object 
 33  weekly_price                 2227 non-null   object 
 34  security_deposit             2770 non-null   object 
 35  cleaning_fee                 3587 non-null   object 
 36  guests_included              5835 non-null   int64  
 37  extra_people                 5835 non-null   object 
 38  minimum_nights               5835 non-null   int64  
 39  has_availability             5835 non-null   object 
 40  availability_30              5835 non-null   int64  
 41  availability_60              5835 non-null   int64  
 42  availability_90              5835 non-null   int64  
 43  availability_365             5835 non-null   int64  
 44  number_of_reviews            5835 non-null   int64  
 45  review_scores_rating         3789 non-null   float64
 46  review_scores_accuracy       3776 non-null   float64
 47  review_scores_cleanliness    3778 non-null   float64
 48  review_scores_checkin        3778 non-null   float64
 49  review_scores_communication  3778 non-null   float64
 50  review_scores_location       3779 non-null   float64
 51  review_scores_value          3778 non-null   float64
 52  instant_bookable             5835 non-null   object 
 53  cancellation_policy          5835 non-null   object 
dtypes: float64(12), int64(10), object(32)
memory usage: 2.4+ MB
           id                           listing_url  \
0       72635    https://www.airbnb.com/rooms/72635   
1     5386323  https://www.airbnb.com/rooms/5386323   
2     8826517  https://www.airbnb.com/rooms/8826517   
3     8828616  https://www.airbnb.com/rooms/8828616   
4     8536913  https://www.airbnb.com/rooms/8536913   
...       ...                                   ...   
5830  6063670  https://www.airbnb.com/rooms/6063670   
5831  8422925  https://www.airbnb.com/rooms/8422925   
5832  3345881  https://www.airbnb.com/rooms/3345881   
5833  8954997  https://www.airbnb.com/rooms/8954997   
5834  7618185  https://www.airbnb.com/rooms/7618185   

                                   name  \
0         3 Private Bedrooms, SW Austin   
1                       Cricket Trailer   
2        Private room 1 in South Austin   
3        Private room 2 in South Austin   
4             Brand-New 3BR Austin Home   
...                                 ...   
5830     Austin's Downtown Garden Suite   
5831       Two beds in Downtown Austin!   
5832  Casa Romántica en Picos de Europa   
5833               Living room with bed   
5834    Comfy 1 bedroom in North Austin   

                                                summary  \
0     Conveniently located 10-15 from downtown in SW...   
1     Rent this cool concept trailer that has everyt...   
2     Upstairs, private, 12ft x 13 1/2ft room.  Priv...   
3     Upstairs, private, 11ft x 13 1/2ft room.  Priv...   
4     Brand-new 3BR/2BA Austin home with landscaped ...   
...                                                 ...   
5830  Enjoy being literally steps from everything th...   
5831  Prime location for the Austin Convention Cente...   
5832  Axtur: Picos de Europa. Desfiladero del Sella ...   
5833                Living room with bed have bathroom.   
5834                                                NaN   

                                                  space  \
0     We have three spare bedrooms, each with a quee...   
1     Rental arrangements for this trailer allows yo...   
2                                                   NaN   
3                                                   NaN   
4     Feel instantly at home at our brand new 3BR/2B...   
...                                                 ...   
5830  If you are looking for the perfect suite in th...   
5831  Located in the heart of downtown, this room co...   
5832  Una casa excepcional en un paisaje excepcional...   
5833                                                NaN   
5834  Cozy one bedroom/one bath 1st floor apartment ...   

                                            description experiences_offered  \
0     Conveniently located 10-15 from downtown in SW...                none   
1     Rent this cool concept trailer that has everyt...                none   
2     Upstairs, private, 12ft x 13 1/2ft room.  Priv...                none   
3     Upstairs, private, 11ft x 13 1/2ft room.  Priv...                none   
4     Brand-new 3BR/2BA Austin home with landscaped ...                none   
...                                                 ...                 ...   
5830  Enjoy being literally steps from everything th...                none   
5831  Prime location for the Austin Convention Cente...                none   
5832  Una casa excepcional en un paisaje excepcional...                none   
5833                Living room with bed have bathroom.                none   
5834  Cozy one bedroom/one bath 1st floor apartment ...                none   

                                  neighborhood_overview  \
0     Location and convenience are key.  Easy access...   
1     We're talking about wherever you'd like in the...   
2                                                   NaN   
3                                                   NaN   
4     Entertainment and activities are plentiful her...   
...                                                 ...   
5830  I love that the downtown neighborhood is so vi...   
5831  This truly is in the middle of everything goin...   
5832  Pueblecito asturiano, con muy pocos vecinos, d...   
5833                                                NaN   
5834                                                NaN   

                                                  notes  \
0                                                   NaN   
1                                                   NaN   
2                                                   NaN   
3                                                   NaN   
4                                                   NaN   
...                                                 ...   
5830  If you are interested in hosting an even large...   
5831                                                NaN   
5832                            Paisaje y tranquilidad.   
5833                                                NaN   
5834  The security deposit may be forfeited in the e...   

                                                transit  ...  \
0     Unfortunately there is no convenient public tr...  ...   
1     Bike, Bus, Metrorail, etc. you name it we've g...  ...   
2                                                   NaN  ...   
3                                                   NaN  ...   
4                                                   NaN  ...   
...                                                 ...  ...   
5830  In addition to the Airport Flyer that I alread...  ...   
5831  Buses leave from across the street (including ...  ...   
5832                                           En Coche  ...   
5833                                                NaN  ...   
5834  Close to grocery stores, restaurants and a mov...  ...   

      number_of_reviews review_scores_rating review_scores_accuracy  \
0                     1                100.0                   10.0   
1                     0                  NaN                    NaN   
2                     0                  NaN                    NaN   
3                     0                  NaN                    NaN   
4                     0                  NaN                    NaN   
...                 ...                  ...                    ...   
5830                  9                100.0                   10.0   
5831                  0                  NaN                    NaN   
5832                  1                100.0                    8.0   
5833                  0                  NaN                    NaN   
5834                  0                  NaN                    NaN   

     review_scores_cleanliness review_scores_checkin  \
0                         10.0                  10.0   
1                          NaN                   NaN   
2                          NaN                   NaN   
3                          NaN                   NaN   
4                          NaN                   NaN   
...                        ...                   ...   
5830                      10.0                  10.0   
5831                       NaN                   NaN   
5832                      10.0                  10.0   
5833                       NaN                   NaN   
5834                       NaN                   NaN   

     review_scores_communication review_scores_location review_scores_value  \
0                           10.0                   10.0                10.0   
1                            NaN                    NaN                 NaN   
2                            NaN                    NaN                 NaN   
3                            NaN                    NaN                 NaN   
4                            NaN                    NaN                 NaN   
...                          ...                    ...                 ...   
5830                        10.0                   10.0                 9.0   
5831                         NaN                    NaN                 NaN   
5832                        10.0                   10.0                 8.0   
5833                         NaN                    NaN                 NaN   
5834                         NaN                    NaN                 NaN   

      instant_bookable cancellation_policy  
0                    f            moderate  
1                    f            moderate  
2                    f            flexible  
3                    f            flexible  
4                    f              strict  
...                ...                 ...  
5830                 f              strict  
5831                 f            moderate  
5832                 t              strict  
5833                 f            flexible  
5834                 f              strict  

[5835 rows x 54 columns]
id                              0.000000
listing_url                     0.000000
name                            0.000000
summary                         7.917738
space                          23.307626
description                     0.051414
experiences_offered             0.000000
neighborhood_overview          38.783205
notes                          58.663239
transit                        40.154242
host_id                         0.000000
host_name                       0.257069
host_since                      0.257069
host_location                   0.428449
host_about                     31.893745
host_response_time             28.414739
host_response_rate             28.414739
host_is_superhost               0.257069
host_listings_count             0.257069
host_has_profile_pic            0.257069
host_identity_verified          0.257069
neighbourhood                  17.737789
city                            0.000000
property_type                   0.000000
room_type                       0.000000
accommodates                    0.000000
bathrooms                       0.788346
bedrooms                        0.102828
beds                            0.394173
bed_type                        0.000000
amenities                       0.000000
square_feet                    94.824336
price                           0.000000
weekly_price                   61.833762
security_deposit               52.527849
cleaning_fee                   38.526135
guests_included                 0.000000
extra_people                    0.000000
minimum_nights                  0.000000
has_availability                0.000000
availability_30                 0.000000
availability_60                 0.000000
availability_90                 0.000000
availability_365                0.000000
number_of_reviews               0.000000
review_scores_rating           35.064267
review_scores_accuracy         35.287061
review_scores_cleanliness      35.252785
review_scores_checkin          35.252785
review_scores_communication    35.252785
review_scores_location         35.235647
review_scores_value            35.252785
instant_bookable                0.000000
cancellation_policy             0.000000
dtype: float64

Removing symbols¶

In [6]:
# Columns that need dollar sign removal
columns_with_symbols = ['price', 'weekly_price', 'security_deposit', 'cleaning_fee', 'extra_people', 'host_response_rate']

# Iterate through each column
for col in columns_with_symbols:
    # Remove dollar signs, commas, and percent signs, and convert to numeric
    df[col] = df[col].replace({'\$': '', ',': '', '%': ''}, regex=True).astype(float)

drop descriptive columns to prevent misleading data and the null values were small enough to remove¶

In [7]:
cats = ['host_name', 'host_since', 'host_location','summary', 'space', 'description', 'neighborhood_overview', 'notes', 'transit', 'host_about', 'neighbourhood', 'bedrooms', 'bathrooms']
for col in cats:
    df.dropna(subset=[col], inplace=True)

dropping because the null percentage was close to none

In [8]:
df.columns
Out[8]:
Index(['id', 'listing_url', 'name', 'summary', 'space', 'description',
       'experiences_offered', 'neighborhood_overview', 'notes', 'transit',
       'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood', 'city', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
       'price', 'weekly_price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'instant_bookable',
       'cancellation_policy'],
      dtype='object')

Impute means for columns, create dummies, and apply lambdas¶

In [9]:
df['has_availability'] = df['has_availability'].apply(lambda x: 1 if x == 't' else 0)
In [10]:
df['instant_bookable'] = df['instant_bookable'].apply(lambda x: 1 if x == 't' else 0)
In [11]:
df['host_is_superhost'] = df['host_is_superhost'].apply(lambda x: 1 if x == 't' else 0)
In [12]:
df['host_identity_verified'] = df['host_identity_verified'].apply(lambda x: 1 if x == 't' else 0)
In [13]:
df['host_has_profile_pic'] = df['host_has_profile_pic'].apply(lambda x: 1 if x == 't' else 0)
In [15]:
imp = ['host_response_rate','host_listings_count', 'beds', 'square_feet', 'price', 'weekly_price','security_deposit', 'cleaning_fee', 'guests_included', 'extra_people','minimum_nights','availability_30', 'availability_60', 'availability_90', 'availability_365','number_of_reviews', 'review_scores_rating', 'review_scores_accuracy','review_scores_cleanliness', 'review_scores_checkin','review_scores_communication', 'review_scores_location','review_scores_value']
for i in imp:
    df[i] = pd.to_numeric(df[i], errors='coerce')
In [16]:
for col in imp:
    # Calculate mean for each column
    mean_value = df[col].mean()

    # Fill NaN values with the mean
    df[col].fillna(value=mean_value, inplace=True)
In [17]:
df.head()
Out[17]:
id listing_url name summary space description experiences_offered neighborhood_overview notes transit ... number_of_reviews review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable cancellation_policy
44 5606440 https://www.airbnb.com/rooms/5606440 LAKEFRONT W DOCK CLOSE TO THE OASIS Large bdrm with full bath directly across the ... Situated on a hill on beautiful Lake Travis. W... Large bdrm with full bath directly across the ... none *Close enough to downtown but away from the ch... We call our home "Club Indigo". We have 3 room... There are no bus stops out by the lake. We can... ... 9 100.00000 10.000000 10.000000 10.00000 10.000000 10.00 10.000000 1 strict
56 4704597 https://www.airbnb.com/rooms/4704597 Hillcountry Reatreat Lake Austin Located in the most desirable Texas Hill Count... This home is walks from the lake and all the f... Located in the most desirable Texas Hill Count... none The area is like a dream serene setting in the... The home is fully furnished with brand new mem... There is no public transport you will need a c... ... 4 100.00000 10.000000 10.000000 10.00000 10.000000 10.00 10.000000 0 strict
58 951773 https://www.airbnb.com/rooms/951773 WATERFRONT- STEINER RANCH/LAKEWAY The lake is full! Come stay ON LAKE TRAVIS! Wa... Situated on a hill on beautiful Lake Travis. W... The lake is full! Come stay ON LAKE TRAVIS! Wa... none *Close enough to downtown but away from the ch... We call our home "Club Indigo". We have 3 room... There are no bus stops out by the lake. We can... ... 10 100.00000 10.000000 10.000000 10.00000 10.000000 10.00 10.000000 0 strict
59 8268970 https://www.airbnb.com/rooms/8268970 Austin Casita SW area This Casita is in an upscale area of SW Austi... Our location is private, yet close to city and... This Casita is in an upscale area of SW Austi... none Our neighborhood is about 35 private homes on ... This is our back house to our home. We have t... If available we will Uber for a discounted rat... ... 0 96.13531 9.700465 9.594419 9.88093 9.893855 9.52 9.481378 1 flexible
81 4404358 https://www.airbnb.com/rooms/4404358 Cozy centrally located couch Welcome to the dopest pad in ATX! This spaciou... We have a little creek outside our apartment a... Welcome to the dopest pad in ATX! This spaciou... none We live in the North loop area which is the mo... My roommate and I are pretty laid-back people,... There's free parking on-site, and if you wante... ... 2 80.00000 9.000000 9.000000 10.00000 9.000000 9.00 9.000000 0 moderate

5 rows × 54 columns

Model 1: Linear Regression¶

Fit full regression

In [18]:
import pandas as pd

def extract_numeric_features(df):
    numeric_features = df.select_dtypes(include=['number'])  # Select columns with numeric data types
    return numeric_features
# Assuming df_filtered is your DataFrame
numeric_df = extract_numeric_features(df)
In [19]:
#create for loop to produce graph for each numeric feat
for feature in numeric_df:

  # Set the figure size
  plt.figure(figsize=(10, 3))
  # Create the histogram using seaborn
  sns.histplot(df,x=feature, bins=30, kde=True, color='skyblue')
  # Set the title and x-label
  plt.title(f'Histogram of {feature}')
  plt.xlabel(feature)
  # Display the plot
  plt.show()
In [20]:
#understand relationship between features and target var

for feature in numeric_df:
  if feature != 'price': # not include target var
    plt.figure(figsize=(10, 3))
    sns.scatterplot(df,x=feature, y='price')
    # if you want a linear line
    # sns.regplot(wake,x=feature, y='sale_price', scatter_kws={'s': 50, 'color': 'blue'}, line_kws={'color': 'red'})
    plt.title(f'Scatter Plot of price = {feature} ')
    plt.xlabel(feature)
    plt.ylabel('price')
    # Display the plot
    plt.show()
In [21]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate correlations between 'price' and other variables
price_corr = numeric_df.corr()['price'].sort_values(ascending=False)

# Select top correlated features (adjust the number as needed)
top_corr_features = price_corr.head(10)  # For example, considering top 10 correlated features

# Create a new DataFrame with top correlated features
df_top_corr = df[top_corr_features.index]

# Calculate correlations between selected features
df_correlations = df_top_corr.corr()

# Plot correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df_correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap against Price')
plt.show()
In [22]:
import pandas as pd

# Assuming 'df' is your DataFrame containing Airbnb rental data
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

for column in categorical_columns:
    unique_values_count = df[column].nunique()
    print(f"Column '{column}' has {unique_values_count} unique values.")
Column 'listing_url' has 1307 unique values.
Column 'name' has 1301 unique values.
Column 'summary' has 1275 unique values.
Column 'space' has 1294 unique values.
Column 'description' has 1302 unique values.
Column 'experiences_offered' has 1 unique values.
Column 'neighborhood_overview' has 1217 unique values.
Column 'notes' has 1215 unique values.
Column 'transit' has 1225 unique values.
Column 'host_name' has 660 unique values.
Column 'host_since' has 758 unique values.
Column 'host_location' has 41 unique values.
Column 'host_about' has 1040 unique values.
Column 'host_response_time' has 4 unique values.
Column 'neighbourhood' has 70 unique values.
Column 'city' has 3 unique values.
Column 'property_type' has 16 unique values.
Column 'room_type' has 3 unique values.
Column 'bed_type' has 5 unique values.
Column 'amenities' has 1142 unique values.
Column 'cancellation_policy' has 4 unique values.

Start fitting regression¶

In [23]:
# Your list of columns to consider for prediction
pred_price = [ 'price',
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
    'square_feet', 'host_response_time', 'host_response_rate', 'host_is_superhost',
    'neighbourhood', 'city', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'security_deposit', 'cleaning_fee', 'extra_people',
    'cancellation_policy', 'minimum_nights', 'availability_30', 'availability_60',
    'availability_90', 'availability_365'
]

# Filter out object-type columns
numerical_columns = df[pred_price].select_dtypes(exclude=['object'])

# Display the numerical columns for analysis
print(numerical_columns)
      price  accommodates  bathrooms  bedrooms  beds  square_feet  \
44     85.0             2        1.0       1.0   1.0  1146.732558   
56    599.0            16        3.0       5.0  11.0  1146.732558   
58     85.0             2        1.0       1.0   1.0  1146.732558   
59    110.0             2        1.0       1.0   1.0  1146.732558   
81     75.0             1        1.0       1.0   2.0  1146.732558   
...     ...           ...        ...       ...   ...          ...   
5810   99.0             2        1.0       1.0   1.0  1146.732558   
5815   49.0             1        1.0       1.0   2.0  1146.732558   
5819  185.0             4        1.0       1.0   1.0  1146.732558   
5827  500.0             8        3.0       4.0   4.0  1146.732558   
5830  179.0             4        1.0       1.0   2.0  1146.732558   

      host_response_rate  host_is_superhost  review_scores_rating  \
44            100.000000                  1             100.00000   
56            100.000000                  0             100.00000   
58            100.000000                  1             100.00000   
59            100.000000                  0              96.13531   
81             95.265471                  0              80.00000   
...                  ...                ...                   ...   
5810           86.000000                  0              91.00000   
5815           86.000000                  0              93.00000   
5819          100.000000                  1              94.00000   
5827           87.000000                  0              94.00000   
5830          100.000000                  1             100.00000   

      review_scores_accuracy  review_scores_cleanliness  security_deposit  \
44                 10.000000                  10.000000        100.000000   
56                 10.000000                  10.000000        366.686732   
58                 10.000000                  10.000000        100.000000   
59                  9.700465                   9.594419        366.686732   
81                  9.000000                   9.000000        366.686732   
...                      ...                        ...               ...   
5810                9.000000                   9.000000         95.000000   
5815                9.000000                  10.000000         95.000000   
5819               10.000000                   9.000000        150.000000   
5827                9.000000                   9.000000        500.000000   
5830               10.000000                  10.000000        500.000000   

      cleaning_fee  extra_people  minimum_nights  availability_30  \
44       25.000000          25.0               2               25   
56      300.000000          35.0               2               26   
58       25.000000          25.0               2               26   
59       70.729424           0.0               1               17   
81        5.000000           0.0               1               28   
...            ...           ...             ...              ...   
5810     25.000000          25.0               3               17   
5815     25.000000          49.0               3               24   
5819     70.000000          10.0               1               21   
5827    180.000000           0.0               2               19   
5830    125.000000          50.0               1               15   

      availability_60  availability_90  availability_365  
44                 42               72               341  
56                 56               86               361  
58                 56               86               361  
59                 47               77               352  
81                 58               88                88  
...               ...              ...               ...  
5810               47               77                78  
5815               54               84                84  
5819               51               81                81  
5827               49               76               343  
5830               45               65               340  

[1307 rows x 19 columns]
In [24]:
# Fit full regression
import statsmodels.api as sm


X = numerical_columns.drop(columns=['price'])
y = numerical_columns['price']
X = sm.add_constant(X)
fullreg = sm.OLS(y, X).fit()

# Display regression summary
print(fullreg.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.504
Model:                            OLS   Adj. R-squared:                  0.497
Method:                 Least Squares   F-statistic:                     72.75
Date:                Tue, 12 Dec 2023   Prob (F-statistic):          2.15e-181
Time:                        15:13:40   Log-Likelihood:                -8665.7
No. Observations:                1307   AIC:                         1.737e+04
Df Residuals:                    1288   BIC:                         1.747e+04
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                      -456.1365    126.614     -3.603      0.000    -704.530    -207.743
accommodates                  8.9360      3.852      2.320      0.021       1.378      16.494
bathrooms                    91.7108     12.051      7.610      0.000      68.070     115.352
bedrooms                     33.8172      9.371      3.609      0.000      15.432      52.202
beds                         -7.4885      6.256     -1.197      0.232     -19.762       4.785
square_feet                   0.0932      0.027      3.493      0.000       0.041       0.146
host_response_rate            0.2735      0.500      0.547      0.584      -0.707       1.254
host_is_superhost           -32.2639     12.216     -2.641      0.008     -56.230      -8.298
review_scores_rating          5.0531      1.498      3.374      0.001       2.115       7.991
review_scores_accuracy      -26.3594     11.955     -2.205      0.028     -49.812      -2.907
review_scores_cleanliness    -5.2379      9.884     -0.530      0.596     -24.628      14.152
security_deposit              0.1758      0.021      8.294      0.000       0.134       0.217
cleaning_fee                  1.1637      0.155      7.524      0.000       0.860       1.467
extra_people                  0.3612      0.151      2.398      0.017       0.066       0.657
minimum_nights                1.3601      1.757      0.774      0.439      -2.086       4.806
availability_30               2.4161      1.338      1.806      0.071      -0.209       5.041
availability_60               1.4452      1.291      1.119      0.263      -1.088       3.978
availability_90              -2.1338      0.678     -3.149      0.002      -3.463      -0.804
availability_365              0.1687      0.057      2.982      0.003       0.058       0.280
==============================================================================
Omnibus:                     1161.545   Durbin-Watson:                   1.980
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            61592.794
Skew:                           3.880   Prob(JB):                         0.00
Kurtosis:                      35.723   Cond. No.                     3.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [25]:
vifres = pd.DataFrame()
vifres["Variable"] = X.columns
vifres["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifres)
                     Variable         VIF
0                       const  614.393013
1                accommodates    4.434816
2                   bathrooms    2.534495
3                    bedrooms    4.017986
4                        beds    4.058308
5                 square_feet    1.075079
6          host_response_rate    1.047838
7           host_is_superhost    1.117366
8        review_scores_rating    1.891659
9      review_scores_accuracy    1.741378
10  review_scores_cleanliness    1.836615
11           security_deposit    1.315338
12               cleaning_fee    1.880256
13               extra_people    1.074484
14             minimum_nights    1.015268
15            availability_30    8.595237
16            availability_60   29.047504
17            availability_90   17.153156
18           availability_365    1.614315
In [26]:
def stepwise_selection(X, y,
                       initial_list=[],
                       threshold_in=0.01,
                       threshold_out = 0.05,
                       verbose=True):
    """ Perform a forward-backward feature selection
    based on p-value from statsmodels.api.OLS

    Arguments:
        X - pandas.DataFrame of numeric features
        y - vector, series of the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions

    Returns: list of selected features

    Example Call: stepwise_selection(X, y)
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print(f'Add  {best_feature} with p-value {best_pval:.4f}')
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print(f'Drop {worst_feature} with p-value {worst_pval:.4f}')
        if not changed:
           break
    return included
In [27]:
selected_features  = stepwise_selection(X, y)
print('resulting features:')
print(selected_features)
Add  const with p-value 0.0000
Add  bathrooms with p-value 0.0000
Add  cleaning_fee with p-value 0.0000
Add  security_deposit with p-value 0.0000
Add  bedrooms with p-value 0.0000
Add  square_feet with p-value 0.0009
resulting features:
['const', 'bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']
In [28]:
# Fit stepwise regression
X = numerical_columns[['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']]
X = sm.add_constant(X)
stepreg = sm.OLS(y, X).fit()

# Display regression summary
print(stepreg.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.484
Model:                            OLS   Adj. R-squared:                  0.482
Method:                 Least Squares   F-statistic:                     243.6
Date:                Tue, 12 Dec 2023   Prob (F-statistic):          8.78e-184
Time:                        15:14:21   Log-Likelihood:                -8692.2
No. Observations:                1307   AIC:                         1.740e+04
Df Residuals:                    1301   BIC:                         1.743e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const             -240.1393     30.882     -7.776      0.000    -300.723    -179.556
bathrooms           93.9019     12.087      7.769      0.000      70.190     117.613
cleaning_fee         1.2744      0.153      8.315      0.000       0.974       1.575
security_deposit     0.1824      0.021      8.550      0.000       0.141       0.224
bedrooms            43.6069      7.698      5.665      0.000      28.505      58.709
square_feet          0.0901      0.027      3.342      0.001       0.037       0.143
==============================================================================
Omnibus:                     1146.638   Durbin-Watson:                   1.988
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            56111.444
Skew:                           3.831   Prob(JB):                         0.00
Kurtosis:                      34.171   Cond. No.                     7.30e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.3e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [29]:
vifresstep = pd.DataFrame()
vifresstep["Variable"] = X.columns
vifresstep["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vifresstep)
           Variable        VIF
0             const  35.447970
1         bathrooms   2.472837
2      cleaning_fee   1.790782
3  security_deposit   1.291727
4          bedrooms   2.629538
5       square_feet   1.063576
In [30]:
#resid
residuals = stepreg.resid

#Q-Q Plot
fig = sm.qqplot(residuals,fit=True, line='45')
plt.show()


# resid vs fitted
plt.figure(figsize=(10, 6))
plt.scatter(stepreg.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()
In [31]:
#trying a log transformation
X = numerical_columns[['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']]
X = sm.add_constant(X)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()

# Display regression summary
print(logreg.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.560
Model:                            OLS   Adj. R-squared:                  0.558
Method:                 Least Squares   F-statistic:                     330.9
Date:                Tue, 12 Dec 2023   Prob (F-statistic):          7.90e-229
Time:                        15:14:27   Log-Likelihood:                -3739.9
No. Observations:                1307   AIC:                             7492.
Df Residuals:                    1301   BIC:                             7523.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
====================================================================================
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                2.8384      0.698      4.064      0.000       1.468       4.209
bathrooms            1.9804      0.273      7.245      0.000       1.444       2.517
cleaning_fee         0.0322      0.003      9.290      0.000       0.025       0.039
security_deposit     0.0030      0.000      6.304      0.000       0.002       0.004
bedrooms             1.8859      0.174     10.832      0.000       1.544       2.227
square_feet          0.0013      0.001      2.060      0.040    6.02e-05       0.002
==============================================================================
Omnibus:                      477.581   Durbin-Watson:                   1.898
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2985.815
Skew:                           1.558   Prob(JB):                         0.00
Kurtosis:                       9.717   Cond. No.                     7.30e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 7.3e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [32]:
# recalc resid
residuals = logreg.resid

fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()


plt.figure(figsize=(10, 6))
plt.scatter(logreg.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()
In [33]:
object_columns = df[pred_price].select_dtypes(include=['object'])

# Create box plots for object-type columns against 'price'
for column in object_columns.columns:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=column, y='price', data=df)
    plt.title(f'Boxplot of {column} against Price')
    plt.xticks(rotation=45)
    plt.xlabel(column)
    plt.ylabel('Price')
    plt.show()
In [34]:
#trying a log transformation

# Filter out object-type columns and create dummy variables
object_columns = df[pred_price].select_dtypes(include=['object'])
dummy_columns = pd.get_dummies(object_columns, drop_first=True)  # Use drop_first to avoid multicollinearity

# Select numerical columns
n = numerical_columns[['bathrooms', 'cleaning_fee', 'security_deposit', 'bedrooms', 'square_feet']]

# Concatenate numerical columns with dummy variables
temp = pd.concat([n, dummy_columns], axis=1)
X = sm.add_constant(temp)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()

# Display regression summary
print(logreg.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.676
Method:                 Least Squares   F-statistic:                     28.78
Date:                Tue, 12 Dec 2023   Prob (F-statistic):          7.35e-251
Time:                        15:14:34   Log-Likelihood:                -3489.1
No. Observations:                1307   AIC:                             7176.
Df Residuals:                    1208   BIC:                             7689.
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
============================================================================================================
                                               coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------
const                                        6.3114      1.115      5.662      0.000       4.124       8.498
bathrooms                                    2.1495      0.248      8.656      0.000       1.662       2.637
cleaning_fee                                 0.0193      0.003      5.960      0.000       0.013       0.026
security_deposit                             0.0028      0.000      6.534      0.000       0.002       0.004
bedrooms                                     1.5192      0.171      8.888      0.000       1.184       1.855
square_feet                                  0.0013      0.001      2.404      0.016       0.000       0.002
property_type_Bed & Breakfast                2.3648      2.648      0.893      0.372      -2.830       7.560
property_type_Boat                          -2.3787      3.666     -0.649      0.517      -9.571       4.813
property_type_Bungalow                       0.8287      2.231      0.371      0.710      -3.549       5.206
property_type_Cabin                          0.8140      1.444      0.564      0.573      -2.019       3.647
property_type_Camper/RV                     -0.9860      1.358     -0.726      0.468      -3.650       1.678
property_type_Chalet                         2.1302      3.676      0.580      0.562      -5.081       9.342
property_type_Condominium                   -0.6283      0.869     -0.723      0.470      -2.334       1.077
property_type_Earth House                   -0.0326      3.690     -0.009      0.993      -7.272       7.207
property_type_House                          0.6926      0.306      2.262      0.024       0.092       1.293
property_type_Loft                           1.0908      0.749      1.456      0.146      -0.379       2.561
property_type_Other                         -0.1341      1.148     -0.117      0.907      -2.387       2.118
property_type_Tent                          -3.8704      3.724     -1.039      0.299     -11.176       3.436
property_type_Tipi                          -0.3563      3.774     -0.094      0.925      -7.761       7.048
property_type_Townhouse                     -0.7478      1.129     -0.663      0.508      -2.962       1.467
property_type_Villa                         -1.2836      3.692     -0.348      0.728      -8.528       5.960
room_type_Private room                      -2.5467      0.294     -8.664      0.000      -3.123      -1.970
room_type_Shared room                       -4.4068      0.846     -5.207      0.000      -6.067      -2.747
host_response_time_within a day             -3.6206      0.409     -8.858      0.000      -4.422      -2.819
host_response_time_within a few hours       -4.0718      0.344    -11.835      0.000      -4.747      -3.397
host_response_time_within an hour           -4.2225      0.316    -13.345      0.000      -4.843      -3.602
neighbourhood_Anderson Mill                  0.2248      1.722      0.131      0.896      -3.153       3.602
neighbourhood_Balcones Civic Association     3.2489      2.024      1.605      0.109      -0.723       7.221
neighbourhood_Barton Creek                   2.9882      2.727      1.096      0.273      -2.362       8.338
neighbourhood_Barton Hills                   2.3778      1.021      2.329      0.020       0.375       4.381
neighbourhood_Bouldin Creek                  2.7877      1.038      2.685      0.007       0.751       4.825
neighbourhood_Brentwood                      0.7645      1.157      0.661      0.509      -1.505       3.034
neighbourhood_Bryker Woods                   3.0245      2.285      1.324      0.186      -1.458       7.507
neighbourhood_Bull Creek                    -0.0473      1.635     -0.029      0.977      -3.255       3.160
neighbourhood_Cherry Creek                  -0.4194      1.062     -0.395      0.693      -2.504       1.665
neighbourhood_Cherrywood                    -6.4792      3.832     -1.691      0.091     -13.997       1.038
neighbourhood_Clarksville                    3.0547      1.147      2.663      0.008       0.804       5.305
neighbourhood_Copperfield                   -2.9738      3.755     -0.792      0.429     -10.341       4.393
neighbourhood_Crestview                     -1.3549      1.503     -0.901      0.368      -4.304       1.595
neighbourhood_Dawson                         1.6165      1.043      1.549      0.122      -0.431       3.664
neighbourhood_Downtown                       3.4356      1.047      3.282      0.001       1.382       5.489
neighbourhood_East Congress                 -0.0994      1.452     -0.068      0.945      -2.949       2.750
neighbourhood_East Downtown                  3.2923      0.919      3.583      0.000       1.489       5.095
neighbourhood_East Riverside                 0.4549      1.046      0.435      0.664      -1.597       2.507
neighbourhood_Galindo                        0.4761      1.156      0.412      0.681      -1.792       2.744
neighbourhood_Govalle                        1.8489      1.102      1.677      0.094      -0.314       4.011
neighbourhood_Gracywoods                    -1.6309      2.302     -0.708      0.479      -6.147       2.886
neighbourhood_Hancock                        0.1647      1.120      0.147      0.883      -2.032       2.361
neighbourhood_Highland                       0.8177      1.330      0.615      0.539      -1.792       3.427
neighbourhood_Holly                          2.3920      1.017      2.353      0.019       0.397       4.387
neighbourhood_Hyde Park                      1.2182      1.155      1.055      0.292      -1.048       3.484
neighbourhood_Lamplight Village              0.5197      2.286      0.227      0.820      -3.965       5.005
neighbourhood_Long Canyon                    4.1144      2.029      2.027      0.043       0.133       8.096
neighbourhood_MLK & 183                     -0.2457      1.154     -0.213      0.831      -2.509       2.018
neighbourhood_McKinney                       0.2733      1.288      0.212      0.832      -2.253       2.800
neighbourhood_Milwood                       -1.0132      2.723     -0.372      0.710      -6.355       4.328
neighbourhood_Montopolis                    -2.3505      1.842     -1.276      0.202      -5.965       1.264
neighbourhood_Mueller                        0.4598      1.264      0.364      0.716      -2.020       2.939
neighbourhood_North Loop                     1.9775      1.284      1.540      0.124      -0.541       4.496
neighbourhood_North Shoal Creek              0.0873      2.016      0.043      0.965      -3.868       4.043
neighbourhood_Northwest Hills               -0.2856      1.371     -0.208      0.835      -2.975       2.404
neighbourhood_Oak Hill                       0.4319      1.770      0.244      0.807      -3.040       3.904
neighbourhood_Old Enfield                    3.9542      2.028      1.950      0.051      -0.025       7.933
neighbourhood_Old West Austin                4.0740      1.065      3.825      0.000       1.985       6.163
neighbourhood_Parker Lane                    0.0921      1.206      0.076      0.939      -2.273       2.457
neighbourhood_Pecan Spings                   0.6457      1.646      0.392      0.695      -2.584       3.876
neighbourhood_Pleasant Valley                1.9377      1.634      1.186      0.236      -1.268       5.144
neighbourhood_Rainey Street                  3.5942      3.838      0.936      0.349      -3.936      11.125
neighbourhood_Rollingwood                    6.9974      2.955      2.368      0.018       1.201      12.794
neighbourhood_Rosedale                       2.7050      1.239      2.183      0.029       0.273       5.137
neighbourhood_Rosewood                      -0.0064      1.064     -0.006      0.995      -2.093       2.081
neighbourhood_SW Williamson Co.             -1.0566      2.286     -0.462      0.644      -5.542       3.429
neighbourhood_South Congress                 3.4130      1.159      2.945      0.003       1.139       5.687
neighbourhood_South First                    3.3920      1.075      3.154      0.002       1.282       5.502
neighbourhood_South Lamar                    2.0646      1.033      1.999      0.046       0.039       4.090
neighbourhood_South Manchaca                 0.8765      1.179      0.743      0.457      -1.437       3.190
neighbourhood_St. Edwards                    1.0860      1.263      0.860      0.390      -1.392       3.564
neighbourhood_St. Johns                      0.2640      2.744      0.096      0.923      -5.120       5.648
neighbourhood_Steiner Ranch                  2.2251      2.284      0.974      0.330      -2.256       6.706
neighbourhood_Sunset Valley                  1.4631      1.011      1.447      0.148      -0.521       3.448
neighbourhood_Tarrytown                      2.3959      1.232      1.945      0.052      -0.021       4.813
neighbourhood_Travis Heights                 2.8197      0.966      2.919      0.004       0.924       4.715
neighbourhood_University Hills              -0.1766      2.726     -0.065      0.948      -5.526       5.172
neighbourhood_University of Texas            1.6276      1.268      1.283      0.200      -0.861       4.116
neighbourhood_Upper Boggy Creek              1.5244      1.176      1.296      0.195      -0.783       3.832
neighbourhood_Walnut Creek                   2.8406      3.744      0.759      0.448      -4.504      10.185
neighbourhood_West Austin                   -0.8102      2.721     -0.298      0.766      -6.148       4.528
neighbourhood_West Campus                    3.1710      1.305      2.431      0.015       0.611       5.730
neighbourhood_West Congress                 -0.7815      1.554     -0.503      0.615      -3.830       2.267
neighbourhood_Westgate                      -0.3043      2.284     -0.133      0.894      -4.785       4.177
neighbourhood_Westlake Hills                 2.0257      1.681      1.205      0.228      -1.273       5.324
neighbourhood_Windsor Hills                 -1.7916      3.740     -0.479      0.632      -9.130       5.546
neighbourhood_Windsor Park                  -0.1799      1.280     -0.141      0.888      -2.690       2.331
neighbourhood_Wooten                        -1.6602      2.051     -0.809      0.419      -5.685       2.365
neighbourhood_Zilker                         3.0918      0.967      3.198      0.001       1.195       4.989
city_Sunset Valley                           1.4631      1.011      1.447      0.148      -0.521       3.448
city_West Lake Hills                        -3.4568      2.309     -1.497      0.135      -7.987       1.074
cancellation_policy_moderate                -0.0754      0.311     -0.243      0.808      -0.685       0.534
cancellation_policy_strict                  -0.3984      0.296     -1.346      0.178      -0.979       0.182
cancellation_policy_super_strict_30         14.8458      2.655      5.592      0.000       9.637      20.055
==============================================================================
Omnibus:                      361.904   Durbin-Watson:                   2.021
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2338.457
Skew:                           1.121   Prob(JB):                         0.00
Kurtosis:                       9.158   Cond. No.                     2.00e+19
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.92e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [35]:
selected_features  = stepwise_selection(X, logy)
print('resulting features:')
print(selected_features)
Add  const with p-value 0.0000
Add  bedrooms with p-value 0.0000
Add  cleaning_fee with p-value 0.0000
Add  room_type_Private room with p-value 0.0000
Add  bathrooms with p-value 0.0000
Add  security_deposit with p-value 0.0000
Add  host_response_time_within an hour with p-value 0.0000
Add  host_response_time_within a few hours with p-value 0.0000
Add  host_response_time_within a day with p-value 0.0000
Add  room_type_Shared room with p-value 0.0000
Add  cancellation_policy_super_strict_30 with p-value 0.0000
Add  neighbourhood_East Downtown with p-value 0.0000
Add  neighbourhood_Old West Austin with p-value 0.0001
Add  neighbourhood_Zilker with p-value 0.0012
Add  neighbourhood_South First with p-value 0.0044
Add  neighbourhood_Travis Heights with p-value 0.0036
Add  neighbourhood_Downtown with p-value 0.0018
Add  square_feet with p-value 0.0049
Add  neighbourhood_Bouldin Creek with p-value 0.0044
Add  neighbourhood_South Congress with p-value 0.0082
Add  neighbourhood_Holly with p-value 0.0086
resulting features:
['const', 'bedrooms', 'cleaning_fee', 'room_type_Private room', 'bathrooms', 'security_deposit', 'host_response_time_within an hour', 'host_response_time_within a few hours', 'host_response_time_within a day', 'room_type_Shared room', 'cancellation_policy_super_strict_30', 'neighbourhood_East Downtown', 'neighbourhood_Old West Austin', 'neighbourhood_Zilker', 'neighbourhood_South First', 'neighbourhood_Travis Heights', 'neighbourhood_Downtown', 'square_feet', 'neighbourhood_Bouldin Creek', 'neighbourhood_South Congress', 'neighbourhood_Holly']
In [36]:
#trying a log transformation
X = temp[['bedrooms', 'cleaning_fee', 'room_type_Private room', 'bathrooms', 'security_deposit', 'host_response_time_within an hour', 'host_response_time_within a few hours', 'host_response_time_within a day', 'room_type_Shared room', 'cancellation_policy_super_strict_30', 'neighbourhood_East Downtown', 'neighbourhood_Old West Austin', 'neighbourhood_Zilker', 'neighbourhood_South First', 'neighbourhood_Travis Heights', 'neighbourhood_Downtown', 'square_feet', 'neighbourhood_Bouldin Creek', 'neighbourhood_South Congress', 'neighbourhood_Holly']]

X = sm.add_constant(X)
logy = np.sqrt(y)
logreg = sm.OLS(logy, X).fit()

# Display regression summary
print(logreg.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.675
Model:                            OLS   Adj. R-squared:                  0.669
Method:                 Least Squares   F-statistic:                     133.3
Date:                Tue, 12 Dec 2023   Prob (F-statistic):          5.48e-296
Time:                        15:14:51   Log-Likelihood:                -3542.6
No. Observations:                1307   AIC:                             7127.
Df Residuals:                    1286   BIC:                             7236.
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
=========================================================================================================
                                            coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------
const                                     7.1945      0.674     10.674      0.000       5.872       8.517
bedrooms                                  1.4738      0.158      9.334      0.000       1.164       1.784
cleaning_fee                              0.0210      0.003      6.771      0.000       0.015       0.027
room_type_Private room                   -2.6253      0.257    -10.233      0.000      -3.129      -2.122
bathrooms                                 2.2529      0.238      9.472      0.000       1.786       2.720
security_deposit                          0.0026      0.000      6.226      0.000       0.002       0.003
host_response_time_within an hour        -4.1493      0.302    -13.726      0.000      -4.742      -3.556
host_response_time_within a few hours    -3.8902      0.332    -11.733      0.000      -4.541      -3.240
host_response_time_within a day          -3.4859      0.396     -8.807      0.000      -4.262      -2.709
room_type_Shared room                    -4.7189      0.792     -5.955      0.000      -6.273      -3.164
cancellation_policy_super_strict_30      16.6401      2.602      6.396      0.000      11.536      21.744
neighbourhood_East Downtown               2.3487      0.345      6.815      0.000       1.673       3.025
neighbourhood_Old West Austin             2.9280      0.632      4.629      0.000       1.687       4.169
neighbourhood_Zilker                      2.0090      0.457      4.401      0.000       1.113       2.905
neighbourhood_South First                 2.3657      0.654      3.615      0.000       1.082       3.650
neighbourhood_Travis Heights              1.6817      0.452      3.719      0.000       0.795       2.569
neighbourhood_Downtown                    2.1049      0.569      3.697      0.000       0.988       3.222
square_feet                               0.0015      0.001      2.774      0.006       0.000       0.003
neighbourhood_Bouldin Creek               1.8736      0.600      3.123      0.002       0.697       3.050
neighbourhood_South Congress              2.1718      0.782      2.778      0.006       0.638       3.705
neighbourhood_Holly                       1.4566      0.553      2.632      0.009       0.371       2.542
==============================================================================
Omnibus:                      349.384   Durbin-Watson:                   1.993
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2031.623
Skew:                           1.109   Prob(JB):                         0.00
Kurtosis:                       8.691   Cond. No.                     3.14e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.14e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [37]:
# recalc resid
residuals = logreg.resid

fig = sm.qqplot(residuals, fit=True, line='45')
plt.show()


plt.figure(figsize=(10, 6))
plt.scatter(logreg.fittedvalues, residuals)
plt.xlabel('Fitted Values')
plt.ylabel('Residuals')
plt.title('Residuals vs. Fitted Values')
plt.axhline(y=0, color='r', linestyle='--')  # Add a horizontal line at y=0
plt.grid(True)
plt.show()

Logistic Regression¶

In [38]:
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.feature_selection import SelectFromModel
In [39]:
# function to calculate perofrmance from sklearn logistic regression models
def performance_rpt(model, X_test, y_test):
    # Calculate performance metrics using the provided model
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    y_true = y_test
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    aucroc = roc_auc_score(y_test, y_proba)

    # Print the performance metrics
    #print(" -- Model Performance on Test Set --")
    print(f"Accuracy : {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall   : {recall:.4f}")
    print(f"AUC-ROC  : {aucroc:.4f}\n")

    return accuracy, precision, recall, aucroc
In [40]:
# function for model coefficents from sklearn logistic regression
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X2.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients
In [41]:
# plot variable importance for sklearn logistic regression
def plot_variable_imp(df_coef):
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")

create pred_booked containing the columns I want to use to answer this question

In [81]:
# List of predictor variables for booking probability
pred_booked = df[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds','bed_type','price','security_deposit',
    'cleaning_fee', 'number_of_reviews', 'review_scores_location',
       'review_scores_value','square_feet', 'host_response_time', 'host_response_rate', 'host_is_superhost',
    'neighbourhood', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
    'availability_30', 'availability_60', 'availability_365'
]]
# Create a new column 'booked' based on 90-day availability less than 40%
pred_booked['booked'] = (df['availability_90'] < 0.4 * 90).astype(int)
In [82]:
pred_booked.columns
Out[82]:
Index(['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'number_of_reviews', 'review_scores_location', 'review_scores_value',
       'square_feet', 'host_response_time', 'host_response_rate',
       'host_is_superhost', 'neighbourhood', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'cancellation_policy', 'minimum_nights', 'availability_30',
       'availability_60', 'availability_365', 'booked'],
      dtype='object')
In [83]:
import seaborn as sns
import matplotlib.pyplot as plt

columns_of_interest = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'number_of_reviews','review_scores_location',
       'review_scores_value', 'square_feet', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'neighbourhood',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
       'availability_30', 'availability_60', 'availability_365']

subset = pred_booked[columns_of_interest]

# Calculate the correlation matrix
correlation_matrix = subset.corr()

# Plotting the heatmap of the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Variables')
plt.show()

set up¶

In [84]:
for column in pred_booked.columns:
    if pred_booked[column].dtype == 'object':
        print(column)
property_type
room_type
bed_type
host_response_time
neighbourhood
cancellation_policy
In [85]:
# split the data into training and testing sets
X3 = pred_booked.drop(['booked'], axis=1)

X3 = pred_booked[['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'bed_type', 'price', 'security_deposit', 'cleaning_fee',
       'number_of_reviews','review_scores_location',
       'review_scores_value', 'square_feet', 'host_response_time',
       'host_response_rate', 'host_is_superhost', 'neighbourhood',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'cancellation_policy', 'minimum_nights',
       'availability_30', 'availability_60', 'availability_365']]

categorical_columns = ['property_type', 'room_type', 'bed_type', 'neighbourhood','host_response_time','cancellation_policy']


# Apply one-hot encoding to categorical columns
X3 = pd.get_dummies(X3, columns=categorical_columns, drop_first=True)

y3 = pred_booked['booked']

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=42)

X3_train.info()
y3_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 914 entries, 5414 to 4770
Columns: 116 entries, accommodates to cancellation_policy_super_strict_30
dtypes: float64(13), int64(7), uint8(96)
memory usage: 235.6 KB
<class 'pandas.core.series.Series'>
Int64Index: 914 entries, 5414 to 4770
Series name: booked
Non-Null Count  Dtype
--------------  -----
914 non-null    int64
dtypes: int64(1)
memory usage: 14.3 KB

Using l1 regularization¶

In [121]:
# Create an Instance of Logistic Regression for LASSO Selection  using c = 0.1 and c = 0.01

lr_l1_1 = LogisticRegression(penalty='l1', solver='liblinear', C=.1)
lr_l1_01 = LogisticRegression(penalty='l1', solver='liblinear', C= .01)

# fit the models to the training data
lr_l1_1.fit(X3_train, y3_train)
lr_l1_01.fit(X3_train, y3_train)

# Create an Instance of Logistic Regression for Ridge Regression (L2 regularization)
lr_l2 = LogisticRegression(penalty='l2', C = .01, solver='liblinear')

# Create an instance of Logistic Regression for Elastic Net
lr_l12 = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga')

# fit the models to the training data
lr_l2.fit(X3_train, y3_train)
lr_l12.fit(X3_train, y3_train)
Out[121]:
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')
In [122]:
# function for model coefficents
def rpt_model_variables(model):
    # Get the intercept term
    intercept = model.intercept_

    # Access the coefficients (weights) of the model, i rounded them
    coefficients = np.round(model.coef_[0],decimals=4)

    # Create DataFrames for intercept and coefficients
    #df_intercept = pd.DataFrame({'feature': ['Intercept'], 'coefficient': [intercept[0]]})
    df_coefficients = pd.DataFrame({'feature': X3.columns, 'coefficient': coefficients})
    df_coefficients['abs_coefficient'] = df_coefficients['coefficient'].abs()
    df_coefficients.sort_values(by='abs_coefficient', ascending=False, inplace=True)

    # if you want to add intercept to table
    #df_model = pd.concat([df_intercept, df_coefficients]).reset_index(drop=True)

    # Print the DataFrame
    print(df_coefficients)

    return df_coefficients
In [123]:
df_coefficients1 = rpt_model_variables(lr_l1_1)
df_coefficients01 = rpt_model_variables(lr_l1_01)
df_coefficients2 = rpt_model_variables(lr_l2)
df_coefficients12 = rpt_model_variables(lr_l12)
                                      feature  coefficient  abs_coefficient
18                            availability_60      -0.2195           0.2195
13                       review_scores_rating       0.0739           0.0739
17                            availability_30       0.0678           0.0678
16                             minimum_nights       0.0145           0.0145
7                           number_of_reviews      -0.0127           0.0127
..                                        ...          ...              ...
42   neighbourhood_Balcones Civic Association       0.0000           0.0000
41                neighbourhood_Anderson Mill       0.0000           0.0000
40                          bed_type_Real Bed       0.0000           0.0000
39                     bed_type_Pull-out Sofa       0.0000           0.0000
115       cancellation_policy_super_strict_30       0.0000           0.0000

[116 rows x 3 columns]
                                 feature  coefficient  abs_coefficient
18                       availability_60      -0.1620           0.1620
13                  review_scores_rating       0.0410           0.0410
7                      number_of_reviews      -0.0081           0.0081
19                      availability_365      -0.0073           0.0073
11                    host_response_rate       0.0031           0.0031
..                                   ...          ...              ...
41           neighbourhood_Anderson Mill       0.0000           0.0000
40                     bed_type_Real Bed       0.0000           0.0000
39                bed_type_Pull-out Sofa       0.0000           0.0000
38                        bed_type_Futon       0.0000           0.0000
115  cancellation_policy_super_strict_30       0.0000           0.0000

[116 rows x 3 columns]
                        feature  coefficient  abs_coefficient
18              availability_60      -0.2168           0.2168
17              availability_30       0.0665           0.0665
13         review_scores_rating       0.0610           0.0610
14       review_scores_accuracy       0.0562           0.0562
9           review_scores_value       0.0531           0.0531
..                          ...          ...              ...
32           property_type_Tipi      -0.0000           0.0000
100  neighbourhood_Walnut Creek      -0.0000           0.0000
22       property_type_Bungalow      -0.0000           0.0000
25         property_type_Chalet      -0.0000           0.0000
43   neighbourhood_Barton Creek       0.0000           0.0000

[116 rows x 3 columns]
                                 feature  coefficient  abs_coefficient
18                       availability_60      -0.0382           0.0382
17                       availability_30      -0.0164           0.0164
19                      availability_365      -0.0077           0.0077
7                      number_of_reviews      -0.0072           0.0072
11                    host_response_rate       0.0051           0.0051
..                                   ...          ...              ...
49            neighbourhood_Cherry Creek       0.0000           0.0000
48              neighbourhood_Bull Creek       0.0000           0.0000
47            neighbourhood_Bryker Woods       0.0000           0.0000
46               neighbourhood_Brentwood       0.0000           0.0000
115  cancellation_policy_super_strict_30       0.0000           0.0000

[116 rows x 3 columns]
In [124]:
# plot variable importance
def plot_variable_imp(df_coef):
  df_plt = df_coef[df_coef['abs_coefficient'] != 0]
  reject_vars = df_coef[df_coef['abs_coefficient'] == 0]['feature'].tolist()

  plt.figure(figsize=(5, 10))
  plt.title('Variable Importance')
  plt.xlabel('Coefficient')
  plt.ylabel('Feature')
  sns.barplot(data=df_plt,
                     y=df_plt['feature'],
                     x=df_plt['abs_coefficient'], color="lightblue")

  plt.show()
  print("-- rejected --")
  for i in reject_vars:
    print(f" {i}")


plot_variable_imp(df_coefficients1)
plot_variable_imp(df_coefficients01)
plot_variable_imp(df_coefficients2)
plot_variable_imp(df_coefficients12)
-- rejected --
 neighbourhood_Pleasant Valley
 neighbourhood_Old West Austin
 neighbourhood_Parker Lane
 neighbourhood_Pecan Spings
 neighbourhood_Rollingwood
 neighbourhood_Rainey Street
 neighbourhood_Oak Hill
 neighbourhood_Rosedale
 neighbourhood_Rosewood
 neighbourhood_SW Williamson Co.
 neighbourhood_South Congress
 neighbourhood_Old Enfield
 accommodates
 neighbourhood_Northwest Hills
 neighbourhood_North Shoal Creek
 neighbourhood_North Loop
 neighbourhood_South Lamar
 neighbourhood_Mueller
 neighbourhood_Montopolis
 neighbourhood_Milwood
 neighbourhood_McKinney
 neighbourhood_MLK & 183
 neighbourhood_Long Canyon
 neighbourhood_Lamplight Village
 neighbourhood_Hyde Park
 neighbourhood_South First
 neighbourhood_Tarrytown
 neighbourhood_South Manchaca
 neighbourhood_St. Edwards
 cancellation_policy_strict
 cancellation_policy_moderate
 host_response_time_within an hour
 host_response_time_within a few hours
 host_response_time_within a day
 neighbourhood_Zilker
 neighbourhood_Wooten
 neighbourhood_Windsor Park
 neighbourhood_Windsor Hills
 neighbourhood_Westlake Hills
 neighbourhood_Westgate
 neighbourhood_West Congress
 neighbourhood_West Campus
 neighbourhood_West Austin
 neighbourhood_Walnut Creek
 neighbourhood_Upper Boggy Creek
 neighbourhood_University of Texas
 neighbourhood_University Hills
 neighbourhood_Travis Heights
 neighbourhood_Highland
 neighbourhood_Sunset Valley
 neighbourhood_Steiner Ranch
 neighbourhood_St. Johns
 neighbourhood_Holly
 neighbourhood_East Riverside
 neighbourhood_Hancock
 neighbourhood_Gracywoods
 property_type_Villa
 property_type_Townhouse
 property_type_Tipi
 property_type_Tent
 property_type_Other
 property_type_Loft
 property_type_House
 property_type_Earth House
 property_type_Condominium
 property_type_Chalet
 property_type_Camper/RV
 property_type_Cabin
 property_type_Bungalow
 property_type_Boat
 property_type_Bed & Breakfast
 review_scores_cleanliness
 review_scores_accuracy
 host_is_superhost
 host_response_rate
 review_scores_value
 review_scores_location
 beds
 bedrooms
 room_type_Private room
 room_type_Shared room
 bed_type_Couch
 neighbourhood_Cherrywood
 neighbourhood_Govalle
 neighbourhood_Galindo
 bathrooms
 neighbourhood_East Downtown
 neighbourhood_East Congress
 neighbourhood_Downtown
 neighbourhood_Dawson
 neighbourhood_Crestview
 neighbourhood_Copperfield
 neighbourhood_Clarksville
 neighbourhood_Cherry Creek
 bed_type_Futon
 neighbourhood_Bull Creek
 neighbourhood_Bryker Woods
 neighbourhood_Brentwood
 neighbourhood_Bouldin Creek
 neighbourhood_Barton Hills
 neighbourhood_Barton Creek
 neighbourhood_Balcones Civic Association
 neighbourhood_Anderson Mill
 bed_type_Real Bed
 bed_type_Pull-out Sofa
 cancellation_policy_super_strict_30
-- rejected --
 neighbourhood_Rainey Street
 neighbourhood_Old West Austin
 neighbourhood_Parker Lane
 neighbourhood_Pecan Spings
 neighbourhood_Pleasant Valley
 accommodates
 neighbourhood_Rollingwood
 neighbourhood_Oak Hill
 neighbourhood_Rosedale
 neighbourhood_Rosewood
 neighbourhood_SW Williamson Co.
 neighbourhood_South Congress
 neighbourhood_Old Enfield
 neighbourhood_North Loop
 neighbourhood_Northwest Hills
 neighbourhood_North Shoal Creek
 neighbourhood_South Lamar
 neighbourhood_Mueller
 neighbourhood_Montopolis
 neighbourhood_Milwood
 neighbourhood_McKinney
 neighbourhood_MLK & 183
 neighbourhood_Long Canyon
 neighbourhood_Lamplight Village
 neighbourhood_Hyde Park
 neighbourhood_South First
 neighbourhood_Sunset Valley
 neighbourhood_South Manchaca
 neighbourhood_St. Edwards
 cancellation_policy_strict
 cancellation_policy_moderate
 host_response_time_within an hour
 host_response_time_within a few hours
 host_response_time_within a day
 neighbourhood_Zilker
 neighbourhood_Wooten
 neighbourhood_Windsor Park
 neighbourhood_Windsor Hills
 neighbourhood_Westlake Hills
 neighbourhood_Westgate
 neighbourhood_West Congress
 neighbourhood_West Campus
 neighbourhood_West Austin
 neighbourhood_Walnut Creek
 neighbourhood_Upper Boggy Creek
 neighbourhood_University of Texas
 neighbourhood_University Hills
 neighbourhood_Travis Heights
 neighbourhood_Tarrytown
 neighbourhood_Highland
 neighbourhood_Steiner Ranch
 neighbourhood_St. Johns
 neighbourhood_Holly
 neighbourhood_East Riverside
 neighbourhood_Hancock
 room_type_Private room
 property_type_Townhouse
 property_type_Tipi
 property_type_Tent
 property_type_Other
 property_type_Loft
 property_type_House
 property_type_Earth House
 property_type_Condominium
 property_type_Chalet
 property_type_Camper/RV
 property_type_Cabin
 property_type_Bungalow
 property_type_Boat
 property_type_Bed & Breakfast
 availability_30
 minimum_nights
 review_scores_cleanliness
 review_scores_accuracy
 host_is_superhost
 review_scores_value
 review_scores_location
 beds
 bedrooms
 property_type_Villa
 room_type_Shared room
 neighbourhood_Gracywoods
 bed_type_Couch
 neighbourhood_Govalle
 neighbourhood_Galindo
 bathrooms
 neighbourhood_East Downtown
 neighbourhood_East Congress
 neighbourhood_Downtown
 neighbourhood_Dawson
 neighbourhood_Crestview
 neighbourhood_Copperfield
 neighbourhood_Clarksville
 neighbourhood_Cherrywood
 neighbourhood_Cherry Creek
 neighbourhood_Bull Creek
 neighbourhood_Bryker Woods
 neighbourhood_Brentwood
 neighbourhood_Bouldin Creek
 neighbourhood_Barton Hills
 neighbourhood_Barton Creek
 neighbourhood_Balcones Civic Association
 neighbourhood_Anderson Mill
 bed_type_Real Bed
 bed_type_Pull-out Sofa
 bed_type_Futon
 cancellation_policy_super_strict_30
-- rejected --
 property_type_Villa
 neighbourhood_Bull Creek
 neighbourhood_Copperfield
 neighbourhood_Cherrywood
 property_type_Earth House
 property_type_Bed & Breakfast
 property_type_Boat
 neighbourhood_Rollingwood
 neighbourhood_Lamplight Village
 neighbourhood_Old Enfield
 neighbourhood_Rainey Street
 neighbourhood_St. Johns
 neighbourhood_Sunset Valley
 property_type_Tipi
 neighbourhood_Walnut Creek
 property_type_Bungalow
 property_type_Chalet
 neighbourhood_Barton Creek
-- rejected --
 neighbourhood_Pleasant Valley
 neighbourhood_South Lamar
 neighbourhood_South First
 neighbourhood_South Congress
 neighbourhood_SW Williamson Co.
 neighbourhood_Rosewood
 neighbourhood_Rosedale
 neighbourhood_Rollingwood
 neighbourhood_Rainey Street
 neighbourhood_Pecan Spings
 neighbourhood_Milwood
 neighbourhood_Parker Lane
 neighbourhood_Old West Austin
 neighbourhood_Old Enfield
 neighbourhood_Oak Hill
 neighbourhood_Northwest Hills
 neighbourhood_North Shoal Creek
 neighbourhood_North Loop
 neighbourhood_Mueller
 neighbourhood_St. Edwards
 neighbourhood_Montopolis
 neighbourhood_South Manchaca
 neighbourhood_Walnut Creek
 neighbourhood_St. Johns
 neighbourhood_Westgate
 cancellation_policy_strict
 cancellation_policy_moderate
 host_response_time_within a few hours
 host_response_time_within a day
 neighbourhood_Zilker
 neighbourhood_Wooten
 neighbourhood_Windsor Park
 neighbourhood_Windsor Hills
 neighbourhood_Westlake Hills
 neighbourhood_West Congress
 neighbourhood_Steiner Ranch
 neighbourhood_West Campus
 neighbourhood_West Austin
 neighbourhood_MLK & 183
 neighbourhood_Upper Boggy Creek
 neighbourhood_University of Texas
 neighbourhood_University Hills
 neighbourhood_Travis Heights
 neighbourhood_Tarrytown
 neighbourhood_Sunset Valley
 neighbourhood_McKinney
 neighbourhood_East Riverside
 neighbourhood_Long Canyon
 neighbourhood_Lamplight Village
 neighbourhood_Anderson Mill
 bed_type_Pull-out Sofa
 bed_type_Futon
 bed_type_Couch
 room_type_Shared room
 property_type_Villa
 property_type_Townhouse
 property_type_Tipi
 property_type_Tent
 property_type_Other
 property_type_Loft
 property_type_House
 property_type_Earth House
 property_type_Condominium
 property_type_Chalet
 property_type_Camper/RV
 property_type_Cabin
 property_type_Bungalow
 property_type_Boat
 property_type_Bed & Breakfast
 bedrooms
 neighbourhood_Balcones Civic Association
 neighbourhood_Barton Creek
 neighbourhood_Barton Hills
 neighbourhood_East Congress
 neighbourhood_Hyde Park
 neighbourhood_Holly
 neighbourhood_Highland
 neighbourhood_Hancock
 neighbourhood_Gracywoods
 neighbourhood_Govalle
 neighbourhood_Galindo
 bathrooms
 neighbourhood_East Downtown
 neighbourhood_Downtown
 neighbourhood_Bouldin Creek
 neighbourhood_Dawson
 neighbourhood_Crestview
 neighbourhood_Copperfield
 neighbourhood_Clarksville
 neighbourhood_Cherrywood
 neighbourhood_Cherry Creek
 neighbourhood_Bull Creek
 neighbourhood_Bryker Woods
 neighbourhood_Brentwood
 cancellation_policy_super_strict_30

Make Predictions¶

In [125]:
# make predictions on the testing data
y_pred_train = lr_l1_1.predict(X3_train)
y_pred_test = lr_l1_1.predict(X3_test)
y_proba_train = lr_l1_1.predict_proba(X3_train)
y_proba_test = lr_l1_1.predict_proba(X3_test)

y_pred_train1 = lr_l1_01.predict(X3_train)
y_pred_test1 = lr_l1_01.predict(X3_test)
y_proba_train1 = lr_l1_01.predict_proba(X3_train)
y_proba_test1 = lr_l1_01.predict_proba(X3_test)

y_pred_train2 = lr_l2.predict(X3_train)
y_pred_test2 = lr_l2.predict(X3_test)
y_proba_train2 = lr_l2.predict_proba(X3_train)
y_proba_test2 = lr_l2.predict_proba(X3_test)

y_pred_train12 = lr_l12.predict(X3_train)
y_pred_test12 = lr_l12.predict(X3_test)
y_proba_train12 = lr_l12.predict_proba(X3_train)
y_proba_test12 = lr_l12.predict_proba(X3_test)

Evaluate A, P, R for l1 c = .1¶

In [126]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

acc3_train = accuracy_score(y3_train, y_pred_train)
prec3_train = precision_score(y3_train, y_pred_train)
rec3_train = recall_score(y3_train, y_pred_train)
auc3_train = roc_auc_score(y3_train, y_proba_train[:, 1])
# Print the scores for the training set
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc3_train))
print("Precision: {:.4f}".format(prec3_train))
print("Recall   : {:.4f}".format(rec3_train))
print("AUC      : {:.4f}".format(auc3_train))
print("")



acc3_test = accuracy_score(y3_test, y_pred_test)
prec3_test = precision_score(y3_test, y_pred_test)
acc3_test = recall_score(y3_test, y_pred_test)
auc3_test = roc_auc_score(y3_test, lr_l1_1.predict_proba(X3_test)[:, 1])

print("Accuracy : {:.4f}".format(acc3_test))
print("Precision: {:.4f}".format(prec3_test))
print("Recall   : {:.4f}".format(acc3_test))
print("AUC      : {:.4f}".format(auc3_test))
 -- train set -- 
Accuracy : 0.9409
Precision: 0.8863
Recall   : 0.8618
AUC      : 0.9885

Accuracy : 0.8774
Precision: 0.8774
Recall   : 0.8774
AUC      : 0.9864

Evaluate A, P, R for l1 c =.01¶

In [127]:
# calculate the accuracy, precision, and recall scores
acc3_train1 = accuracy_score(y3_train, y_pred_train1)
prec3_train1 = precision_score(y3_train, y_pred_train1)
rec3_train1 = recall_score(y3_train, y_pred_train1)
auc3_train1 = roc_auc_score(y3_train, y_proba_train1[:,1])

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc3_train1))
print("Precision: {:.4f}".format(prec3_train1))
print("Recall.  : {:.4f}".format(rec3_train1))
print("AUC      : {:.4f}".format(auc3_train1))
print("")

# calculate the accuracy, precision, and recall scores
acc3_test1 = accuracy_score(y3_test, y_pred_test1)
prec3_test1 = precision_score(y3_test, y_pred_test1)
rec3_test1 = recall_score(y3_test, y_pred_test1)
auc3_test1 = roc_auc_score(y3_test, y_proba_test1[:,1])

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc3_test1))
print("Precision: {:.4f}".format(prec3_test1))
print("Recall.  : {:.4f}".format(rec3_test1))
print("AUC      : {:.4f}".format(auc3_test1))
 -- train set -- 
Accuracy : 0.9431
Precision: 0.8873
Recall.  : 0.8710
AUC      : 0.9872

 -- test set -- 
Accuracy : 0.9389
Precision: 0.8942
Recall.  : 0.8774
AUC      : 0.9867

Evaluate A, P, R for l2¶

In [128]:
# calculate the accuracy, precision, and recall scores
acc3_train2 = accuracy_score(y3_train, y_pred_train2)
prec3_train2 = precision_score(y3_train, y_pred_train2)
rec3_train2 = recall_score(y3_train, y_pred_train2)
auc3_train2 = roc_auc_score(y3_train, y_proba_train2[:,1])

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc3_train2))
print("Precision: {:.4f}".format(prec3_train2))
print("Recall  : {:.4f}".format(rec3_train2))
print("AUC      : {:.4f}".format(auc3_train2))
print("")

# calculate the accuracy, precision, and recall scores
acc3_test2 = accuracy_score(y3_test, y_pred_test2)
prec3_test2 = precision_score(y3_test, y_pred_test2)
rec3_test2 = recall_score(y3_test, y_pred_test2)
auc3_test2 = roc_auc_score(y3_test, y_proba_test2[:,1])

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc3_test2))
print("Precision: {:.4f}".format(prec3_test2))
print("Recall   : {:.4f}".format(rec3_test2))
print("AUC      : {:.4f}".format(auc3_test2))
 -- train set -- 
Accuracy : 0.9431
Precision: 0.8873
Recall  : 0.8710
AUC      : 0.9891

 -- test set -- 
Accuracy : 0.9364
Precision: 0.8785
Recall   : 0.8868
AUC      : 0.9867

Evaluate A, P, R for elastic net¶

In [130]:
# calculate the accuracy, precision, and recall scores
acc3_train12 = accuracy_score(y3_train, y_pred_train12)
prec3_train12 = precision_score(y3_train, y_pred_train12)
rec3_train12 = recall_score(y3_train, y_pred_train12)
auc3_train12 = roc_auc_score(y3_train, y_proba_train12[:,1])

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc3_train12))
print("Precision: {:.4f}".format(prec3_train12))
print("Recall.  : {:.4f}".format(rec3_train12))
print("AUC      : {:.4f}".format(auc3_train12))
print("")

# calculate the accuracy, precision, and recall scores
acc3_test12 = accuracy_score(y3_test, y_pred_test12)
prec3_test12 = precision_score(y3_test, y_pred_test12)
rec3_test12 = recall_score(y3_test, y_pred_test12)
auc3_test12 = roc_auc_score(y3_test, y_proba_test12[:,1])

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc3_test12))
print("Precision: {:.4f}".format(prec3_test12))
print("Recall.  : {:.4f}".format(rec3_test12))
print("AUC      : {:.4f}".format(auc3_test12))
 -- train set -- 
Accuracy : 0.9092
Precision: 0.8895
Recall.  : 0.7051
AUC      : 0.9675

 -- test set -- 
Accuracy : 0.8957
Precision: 0.8495
Recall.  : 0.7453
AUC      : 0.9632

Cluster¶

In [133]:
from IPython.core.display import display, HTML
from IPython.display import clear_output
display(HTML("<style>.container { width:90% }</style>"))
import warnings
warnings.filterwarnings('ignore')
# ------------------------------------------------------------------

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import statsmodels.api as sample_data
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
In [377]:
df.columns
Out[377]:
Index(['id', 'listing_url', 'name', 'summary', 'space', 'description',
       'experiences_offered', 'neighborhood_overview', 'notes', 'transit',
       'host_id', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_is_superhost',
       'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood', 'city', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'square_feet',
       'price', 'weekly_price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'minimum_nights', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value', 'instant_bookable',
       'cancellation_policy', 'booked'],
      dtype='object')

Create new dataframe to avoid confusion on my end

In [141]:
pred_cluster = df[[ 'host_response_rate', 'host_is_superhost',
'host_listings_count', 'host_has_profile_pic', 'host_identity_verified',
'accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet', 'price',
'weekly_price', 'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
'minimum_nights', 'has_availability', 'availability_30', 'availability_60', 'availability_90',
'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
'review_scores_location', 'review_scores_value', 'instant_bookable', 'booked']
]

get only numeric columns because this is a K MEANS clustering

Standardize Data Using StandardSCaler¶

In [142]:
#Standardize Data

dfcolumns = pred_cluster.columns.values.tolist()

scaler = StandardScaler()

df_scaled = scaler.fit_transform(pred_cluster.to_numpy())
df_scaled = pd.DataFrame(df_scaled, columns=dfcolumns)


df_scaled.head()
Out[142]:
host_response_rate host_is_superhost host_listings_count host_has_profile_pic host_identity_verified accommodates bathrooms bedrooms beds square_feet ... number_of_reviews review_scores_rating review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value instant_bookable booked
0 0.452536 1.660044 -0.090166 0.027671 0.483925 -0.892166 -0.642284 -0.651249 -0.782890 1.145794e-15 ... -0.298505 0.823894 5.312191e-01 5.790691e-01 3.219625e-01 0.303469 0.752130 8.037821e-01 2.756571 -0.572933
1 0.452536 -0.602394 -0.163047 0.027671 0.483925 4.121610 2.321408 3.009752 5.296948 1.145794e-15 ... -0.468815 0.823894 5.312191e-01 5.790691e-01 3.219625e-01 0.303469 0.752130 8.037821e-01 -0.362770 -0.572933
2 0.452536 1.660044 -0.090166 0.027671 0.483925 -0.892166 -0.642284 -0.651249 -0.782890 1.145794e-15 ... -0.264443 0.823894 5.312191e-01 5.790691e-01 3.219625e-01 0.303469 0.752130 8.037821e-01 -0.362770 -0.572933
3 0.452536 -0.602394 -0.163047 0.027671 0.483925 -0.892166 -0.642284 -0.651249 -0.782890 1.145794e-15 ... -0.605063 0.000000 -3.150333e-15 -2.536195e-15 -4.803236e-15 0.000000 0.000000 2.753072e-15 2.756571 -0.572933
4 0.000000 -0.602394 -0.163047 0.027671 0.483925 -1.250293 -0.642284 -0.651249 -0.174906 1.145794e-15 ... -0.536939 -3.439806 -1.242261e+00 -8.486816e-01 3.219625e-01 -2.555532 -0.814807 -7.460599e-01 -0.362770 -0.572933

5 rows × 32 columns

How many clusters¶

In [143]:
# set the model type to k-means
model = KMeans()

# generate an elbow plot for k = 2 to 12 clusters using the scaled data
visualizer = KElbowVisualizer(model, k=(2, 12))
visualizer.fit(df_scaled)
visualizer.show()
Out[143]:
<Axes: title={'center': 'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
In [146]:
# Create an instance of KMeans with the desired value of k and fit to the scaled data
kmeans = KMeans(n_clusters=4, random_state=904)
kmclus=kmeans.fit(df_scaled)

clusters = kmeans.fit_predict(df_scaled)

#append cluster to original data
pred_cluster['cluster']=clusters

pred_booked.info()

# Evaluate cluster sizes to insure that they are similar
print(pred_cluster['cluster'].value_counts())

# Examine the means of each variable by cluster
df_desc = pred_cluster.groupby('cluster').mean()
print(df_desc)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 44 to 5830
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   property_type              1307 non-null   object 
 1   room_type                  1307 non-null   object 
 2   accommodates               1307 non-null   int64  
 3   bathrooms                  1307 non-null   float64
 4   bedrooms                   1307 non-null   float64
 5   beds                       1307 non-null   float64
 6   bed_type                   1307 non-null   object 
 7   price                      1307 non-null   float64
 8   security_deposit           1307 non-null   float64
 9   cleaning_fee               1307 non-null   float64
 10  number_of_reviews          1307 non-null   int64  
 11  review_scores_location     1307 non-null   float64
 12  review_scores_value        1307 non-null   float64
 13  square_feet                1307 non-null   float64
 14  host_response_time         1115 non-null   object 
 15  host_response_rate         1307 non-null   float64
 16  host_is_superhost          1307 non-null   int64  
 17  neighbourhood              1307 non-null   object 
 18  review_scores_rating       1307 non-null   float64
 19  review_scores_accuracy     1307 non-null   float64
 20  review_scores_cleanliness  1307 non-null   float64
 21  cancellation_policy        1307 non-null   object 
 22  minimum_nights             1307 non-null   int64  
 23  availability_30            1307 non-null   int64  
 24  availability_60            1307 non-null   int64  
 25  availability_365           1307 non-null   int64  
dtypes: float64(13), int64(7), object(6)
memory usage: 275.7+ KB
0    681
1    310
2    211
3    105
Name: cluster, dtype: int64
         host_response_rate  host_is_superhost  host_listings_count  \
cluster                                                               
0                 95.566201           0.301028             2.559471   
1                 95.426773           0.212903             2.258065   
2                 94.891790           0.345972             5.099526   
3                 93.589716           0.038095             6.780952   

         host_has_profile_pic  host_identity_verified  accommodates  \
cluster                                                               
0                    0.998532                0.787078      3.324523   
1                    1.000000                0.787097      4.158065   
2                    1.000000                0.900474      8.706161   
3                    1.000000                0.847619      4.571429   

         bathrooms  bedrooms      beds  square_feet  ...  number_of_reviews  \
cluster                                              ...                      
0         1.160059  1.243759  1.665198  1122.228648  ...          22.967695   
1         1.379032  1.641935  2.035484  1144.669580  ...           9.506452   
2         2.433649  3.379147  4.720379  1232.594566  ...          14.530806   
3         1.357143  1.600000  2.180952  1139.206866  ...          14.885714   

         review_scores_rating  review_scores_accuracy  \
cluster                                                 
0                   97.148616                9.812407   
1                   96.393516                9.754662   
2                   97.410680                9.797694   
3                   86.238095                8.619048   

         review_scores_cleanliness  review_scores_checkin  \
cluster                                                     
0                         9.743037               9.952098   
1                         9.611802               9.924033   
2                         9.726247               9.940981   
3                         8.314286               9.171429   

         review_scores_communication  review_scores_location  \
cluster                                                        
0                           9.962920                9.611043   
1                           9.927735                9.585290   
2                           9.947497                9.464834   
3                           9.238095                8.847619   

         review_scores_value  instant_bookable    booked  
cluster                                                   
0                   9.634400          0.136564  0.001468  
1                   9.507016          0.087097  0.961290  
2                   9.473744          0.085308  0.085308  
3                   8.428571          0.133333  0.057143  

[4 rows x 32 columns]

Column of interest = Price¶

In [147]:
#day charge eve charge night charge Intl min
plt.figure(figsize=(8,4))
print("Highest Nightly Rates")
sns.displot(data=pred_cluster, x="price", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Highest Nightly Rates
<Figure size 800x400 with 0 Axes>

Column of interest = review_scores_rating¶

In [148]:
#day charge eve charge night charge Intl min
plt.figure(figsize=(8,4))
print("Review Scores Rating")
sns.displot(data=pred_cluster, x="review_scores_rating", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Review Scores Rating
<Figure size 800x400 with 0 Axes>

Column of interest = review_scores_value¶

In [150]:
#day charge eve charge night charge Intl min
plt.figure(figsize=(8,4))
print("Review Scores Value")
sns.displot(data=pred_cluster, x="review_scores_value", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Review Scores Value
<Figure size 800x400 with 0 Axes>

Column of interest = Booked¶

In [149]:
#day charge eve charge night charge Intl min
plt.figure(figsize=(8,4))
print("Booked")
sns.displot(data=pred_cluster, x="booked", bins=8, col='cluster', col_wrap=2, common_bins=True)
plt.show()
Booked
<Figure size 800x400 with 0 Axes>

Build Segmented Model¶

In [222]:
df2 = pred_cluster
In [223]:
#split the data into groups based on clusters
c0 = df2[df2['cluster'] == 0]
c1 = df2[df2['cluster'] == 1]
c2 = df2[df2['cluster'] == 2]
c3 = df2[df2['cluster'] == 3]
In [224]:
# split the data into training and testing sets
X4 = c3.drop(['booked', 'cluster', 'availability_90'], axis=1)

y4 = c3['booked']
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.3, random_state=42)

X4_train.info()
y4_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 73 entries, 5800 to 5774
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_response_rate           73 non-null     float64
 1   host_is_superhost            73 non-null     int64  
 2   host_listings_count          73 non-null     float64
 3   host_has_profile_pic         73 non-null     int64  
 4   host_identity_verified       73 non-null     int64  
 5   accommodates                 73 non-null     int64  
 6   bathrooms                    73 non-null     float64
 7   bedrooms                     73 non-null     float64
 8   beds                         73 non-null     float64
 9   square_feet                  73 non-null     float64
 10  price                        73 non-null     float64
 11  weekly_price                 73 non-null     float64
 12  security_deposit             73 non-null     float64
 13  cleaning_fee                 73 non-null     float64
 14  guests_included              73 non-null     int64  
 15  extra_people                 73 non-null     float64
 16  minimum_nights               73 non-null     int64  
 17  has_availability             73 non-null     int64  
 18  availability_30              73 non-null     int64  
 19  availability_60              73 non-null     int64  
 20  availability_365             73 non-null     int64  
 21  number_of_reviews            73 non-null     int64  
 22  review_scores_rating         73 non-null     float64
 23  review_scores_accuracy       73 non-null     float64
 24  review_scores_cleanliness    73 non-null     float64
 25  review_scores_checkin        73 non-null     float64
 26  review_scores_communication  73 non-null     float64
 27  review_scores_location       73 non-null     float64
 28  review_scores_value          73 non-null     float64
 29  instant_bookable             73 non-null     int64  
dtypes: float64(18), int64(12)
memory usage: 17.7 KB
<class 'pandas.core.series.Series'>
Int64Index: 73 entries, 5800 to 5774
Series name: booked
Non-Null Count  Dtype
--------------  -----
73 non-null     int64
dtypes: int64(1)
memory usage: 1.1 KB

Attempt at creating models for each group¶

In [225]:
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X4_train, y4_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
{'ccp_alpha': 0, 'max_depth': 10, 'min_samples_leaf': 1}
In [226]:
# create an instance of a decision tree classifier using default values

dt = DecisionTreeClassifier(max_depth = None, min_samples_leaf=1, ccp_alpha = 0.001)

# fit the model to the training data
dt.fit(X4_train, y4_train)
Out[226]:
DecisionTreeClassifier(ccp_alpha=0.001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001)
In [227]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
export_graphviz(dt,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X4_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[227]:
<matplotlib.image.AxesImage at 0x7d0b3e624190>
In [228]:
# make predictions on the training and test data
y4_pred_train = dt.predict(X4_train)
y4_pred_test = dt.predict(X4_test)

y4_prob_train = dt.predict_proba(X4_train)
y4_prob_test = dt.predict_proba(X4_test)
In [229]:
# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y4_train, y4_pred_train)
prec_train = precision_score(y4_train, y4_pred_train)
rec_train = recall_score(y4_train, y4_pred_train)

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y4_test, y4_pred_test)
prec_test = precision_score(y4_test, y4_pred_test)
rec_test = recall_score(y4_test, y4_pred_test)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 1.0000
Precision: 1.0000
Recall.  : 1.0000

 -- test set -- 
Accuracy : 0.9688
Precision: 1.0000
Recall.  : 0.5000

Segmented Model or Cluster 3¶

In [230]:
# split the data into training and testing sets
X5 = c2.drop(['booked', 'cluster', 'availability_90'], axis=1)

y5 = c2['booked']
X5_train, X5_test, y5_train, y5_test = train_test_split(X5, y5, test_size=0.3, random_state=42)

X5_train.info()
y5_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 5557 to 2503
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_response_rate           147 non-null    float64
 1   host_is_superhost            147 non-null    int64  
 2   host_listings_count          147 non-null    float64
 3   host_has_profile_pic         147 non-null    int64  
 4   host_identity_verified       147 non-null    int64  
 5   accommodates                 147 non-null    int64  
 6   bathrooms                    147 non-null    float64
 7   bedrooms                     147 non-null    float64
 8   beds                         147 non-null    float64
 9   square_feet                  147 non-null    float64
 10  price                        147 non-null    float64
 11  weekly_price                 147 non-null    float64
 12  security_deposit             147 non-null    float64
 13  cleaning_fee                 147 non-null    float64
 14  guests_included              147 non-null    int64  
 15  extra_people                 147 non-null    float64
 16  minimum_nights               147 non-null    int64  
 17  has_availability             147 non-null    int64  
 18  availability_30              147 non-null    int64  
 19  availability_60              147 non-null    int64  
 20  availability_365             147 non-null    int64  
 21  number_of_reviews            147 non-null    int64  
 22  review_scores_rating         147 non-null    float64
 23  review_scores_accuracy       147 non-null    float64
 24  review_scores_cleanliness    147 non-null    float64
 25  review_scores_checkin        147 non-null    float64
 26  review_scores_communication  147 non-null    float64
 27  review_scores_location       147 non-null    float64
 28  review_scores_value          147 non-null    float64
 29  instant_bookable             147 non-null    int64  
dtypes: float64(18), int64(12)
memory usage: 35.6 KB
<class 'pandas.core.series.Series'>
Int64Index: 147 entries, 5557 to 2503
Series name: booked
Non-Null Count  Dtype
--------------  -----
147 non-null    int64
dtypes: int64(1)
memory usage: 2.3 KB
In [231]:
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X5_train, y5_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 10}
In [232]:
# create an instance of a decision tree classifier using default values

dt = DecisionTreeClassifier(max_depth = None, min_samples_leaf=10, ccp_alpha = 0.001)

# fit the model to the training data
dt.fit(X5_train, y5_train)
Out[232]:
DecisionTreeClassifier(ccp_alpha=0.001, min_samples_leaf=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, min_samples_leaf=10)
In [233]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
export_graphviz(dt,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X5_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[233]:
<matplotlib.image.AxesImage at 0x7d0b3e317430>
In [234]:
# make predictions on the training and test data
y5_pred_train = dt.predict(X5_train)
y5_pred_test = dt.predict(X5_test)

y5_prob_train = dt.predict_proba(X5_train)
y5_prob_test = dt.predict_proba(X5_test)
In [235]:
# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y5_train, y5_pred_train)
prec_train = precision_score(y5_train, y5_pred_train)
rec_train = recall_score(y5_train, y5_pred_train)

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y5_test, y5_pred_test)
prec_test = precision_score(y5_test, y5_pred_test)
rec_test = recall_score(y5_test, y5_pred_test)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 0.9864
Precision: 0.9286
Recall.  : 0.9286

 -- test set -- 
Accuracy : 0.9844
Precision: 1.0000
Recall.  : 0.7500

Segemented Model for Cluster 2¶

In [236]:
# split the data into training and testing sets
X6 = c1.drop(['booked', 'cluster', 'availability_90'], axis=1)

y6 = c1['booked']
X6_train, X6_test, y6_train, y6_test = train_test_split(X6, y6, test_size=0.3, random_state=42)

X6_train.info()
y6_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 217 entries, 5490 to 1730
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_response_rate           217 non-null    float64
 1   host_is_superhost            217 non-null    int64  
 2   host_listings_count          217 non-null    float64
 3   host_has_profile_pic         217 non-null    int64  
 4   host_identity_verified       217 non-null    int64  
 5   accommodates                 217 non-null    int64  
 6   bathrooms                    217 non-null    float64
 7   bedrooms                     217 non-null    float64
 8   beds                         217 non-null    float64
 9   square_feet                  217 non-null    float64
 10  price                        217 non-null    float64
 11  weekly_price                 217 non-null    float64
 12  security_deposit             217 non-null    float64
 13  cleaning_fee                 217 non-null    float64
 14  guests_included              217 non-null    int64  
 15  extra_people                 217 non-null    float64
 16  minimum_nights               217 non-null    int64  
 17  has_availability             217 non-null    int64  
 18  availability_30              217 non-null    int64  
 19  availability_60              217 non-null    int64  
 20  availability_365             217 non-null    int64  
 21  number_of_reviews            217 non-null    int64  
 22  review_scores_rating         217 non-null    float64
 23  review_scores_accuracy       217 non-null    float64
 24  review_scores_cleanliness    217 non-null    float64
 25  review_scores_checkin        217 non-null    float64
 26  review_scores_communication  217 non-null    float64
 27  review_scores_location       217 non-null    float64
 28  review_scores_value          217 non-null    float64
 29  instant_bookable             217 non-null    int64  
dtypes: float64(18), int64(12)
memory usage: 52.6 KB
<class 'pandas.core.series.Series'>
Int64Index: 217 entries, 5490 to 1730
Series name: booked
Non-Null Count  Dtype
--------------  -----
217 non-null    int64
dtypes: int64(1)
memory usage: 3.4 KB
In [237]:
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X6_train, y6_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
{'ccp_alpha': 0.01, 'max_depth': None, 'min_samples_leaf': 1}
In [238]:
# create an instance of a decision tree classifier using default values

dt = DecisionTreeClassifier(max_depth = None, min_samples_leaf=1, ccp_alpha = 0.001)

# fit the model to the training data
dt.fit(X6_train, y6_train)
Out[238]:
DecisionTreeClassifier(ccp_alpha=0.001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001)
In [239]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
export_graphviz(dt,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X6_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[239]:
<matplotlib.image.AxesImage at 0x7d0b3e0f5b70>
In [240]:
# make predictions on the training and test data
y6_pred_train = dt.predict(X6_train)
y6_pred_test = dt.predict(X6_test)

y6_prob_train = dt.predict_proba(X6_train)
y6_prob_test = dt.predict_proba(X6_test)
In [241]:
# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y6_train, y6_pred_train)
prec_train = precision_score(y6_train, y6_pred_train)
rec_train = recall_score(y6_train, y6_pred_train)

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y6_test, y6_pred_test)
prec_test = precision_score(y6_test, y6_pred_test)
rec_test = recall_score(y6_test, y6_pred_test)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 1.0000
Precision: 1.0000
Recall.  : 1.0000

 -- test set -- 
Accuracy : 0.9140
Precision: 0.9326
Recall.  : 0.9765

Segemented Model for Cluster 1¶

In [242]:
# split the data into training and testing sets
X7 = c0.drop(['booked', 'cluster', 'availability_90'], axis=1)

y7 = c0['booked']
X7_train, X7_test, y7_train, y7_test = train_test_split(X7, y7, test_size=0.3, random_state=42)

X7_train.info()
y7_train.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 476 entries, 3763 to 921
Data columns (total 30 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_response_rate           476 non-null    float64
 1   host_is_superhost            476 non-null    int64  
 2   host_listings_count          476 non-null    float64
 3   host_has_profile_pic         476 non-null    int64  
 4   host_identity_verified       476 non-null    int64  
 5   accommodates                 476 non-null    int64  
 6   bathrooms                    476 non-null    float64
 7   bedrooms                     476 non-null    float64
 8   beds                         476 non-null    float64
 9   square_feet                  476 non-null    float64
 10  price                        476 non-null    float64
 11  weekly_price                 476 non-null    float64
 12  security_deposit             476 non-null    float64
 13  cleaning_fee                 476 non-null    float64
 14  guests_included              476 non-null    int64  
 15  extra_people                 476 non-null    float64
 16  minimum_nights               476 non-null    int64  
 17  has_availability             476 non-null    int64  
 18  availability_30              476 non-null    int64  
 19  availability_60              476 non-null    int64  
 20  availability_365             476 non-null    int64  
 21  number_of_reviews            476 non-null    int64  
 22  review_scores_rating         476 non-null    float64
 23  review_scores_accuracy       476 non-null    float64
 24  review_scores_cleanliness    476 non-null    float64
 25  review_scores_checkin        476 non-null    float64
 26  review_scores_communication  476 non-null    float64
 27  review_scores_location       476 non-null    float64
 28  review_scores_value          476 non-null    float64
 29  instant_bookable             476 non-null    int64  
dtypes: float64(18), int64(12)
memory usage: 115.3 KB
<class 'pandas.core.series.Series'>
Int64Index: 476 entries, 3763 to 921
Series name: booked
Non-Null Count  Dtype
--------------  -----
476 non-null    int64
dtypes: int64(1)
memory usage: 7.4 KB
In [254]:
dt_tune = DecisionTreeClassifier()

param_grid = {
    'max_depth': [None, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 10, 20, 50, 100],
    'ccp_alpha': [0, 0.001, 0.01, 0.1, 0.5, 1]
}

grid_search = GridSearchCV(dt_tune, param_grid, cv=5)
grid_search.fit(X7_train, y7_train)

best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print(best_params)
{'ccp_alpha': 0, 'max_depth': None, 'min_samples_leaf': 1}
In [255]:
# create an instance of a decision tree classifier using default values

dt = DecisionTreeClassifier(max_depth = 10, min_samples_leaf=1, ccp_alpha = 0.001)

# fit the model to the training data
dt.fit(X7_train, y7_train)
Out[255]:
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier(ccp_alpha=0.001, max_depth=10)
In [256]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
export_graphviz(dt,
                'tree.dot',
                class_names=['0','1'],
                feature_names = X7_train.columns)
! dot -Tpng tree.dot -o tree.png

import matplotlib.pyplot as plt
import cv2
%matplotlib inline
img = cv2.imread('tree.png')
plt.figure(figsize = (20, 40))
plt.imshow(img)
Out[256]:
<matplotlib.image.AxesImage at 0x7d0b5ac89090>
In [257]:
# make predictions on the training and test data
y7_pred_train = dt.predict(X7_train)
y7_pred_test = dt.predict(X7_test)

y7_prob_train = dt.predict_proba(X7_train)
y7_prob_test = dt.predict_proba(X7_test)
In [258]:
# calculate the accuracy, precision, and recall scores
acc_train = accuracy_score(y7_train, y7_pred_train)
prec_train = precision_score(y7_train, y7_pred_train)
rec_train = recall_score(y7_train, y7_pred_train)

# print the scores
print(" -- train set -- ")
print("Accuracy : {:.4f}".format(acc_train))
print("Precision: {:.4f}".format(prec_train))
print("Recall.  : {:.4f}".format(rec_train))
print("")

# calculate the accuracy, precision, and recall scores
acc_test = accuracy_score(y7_test, y7_pred_test)
prec_test = precision_score(y7_test, y7_pred_test)
rec_test = recall_score(y7_test, y7_pred_test)

print(" -- test set -- ")
print("Accuracy : {:.4f}".format(acc_test))
print("Precision: {:.4f}".format(prec_test))
print("Recall.  : {:.4f}".format(rec_test))
 -- train set -- 
Accuracy : 1.0000
Precision: 1.0000
Recall.  : 1.0000

 -- test set -- 
Accuracy : 0.9902
Precision: 0.0000
Recall.  : 0.0000

Weird output

In [ ]: